manhattan_rides_df <- read_csv("manhattan_rides.csv")
manhattan_rides_df <-
manhattan_rides_df %>%
mutate(
day_of_week = factor(day_of_week, ordered = T,
levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")),
year = factor(year),
age_group = factor(age_group, ordered = T,
levels = c("18-25","26-35", "36-45", "46-55", "56-65", "66-85")),
gender = type.convert(gender, as.is = F))
Overall and as expected, there were more CitiBike rides in 2019 beginning in the peak months of the pandemic in 2020. FIX AXIS
manhattan_rides_df %>%
group_by(start_date, year) %>%
summarize(obs = n()) %>%
ggplot(aes(x = start_date, y = obs, group = year, color = year)) +
geom_line() +
geom_smooth(se = FALSE)

manhattan_rides_df %>%
group_by(year) %>%
mutate(
month = month(starttime, label = T)
) %>%
filter(tripduration < 2500) %>%
plot_ly(
x = ~month,
y = ~trip_min,
color = ~year,
type = "box") %>%
layout(
boxmode = "group",
title = "Duration of Citibike Rides by Month",
xaxis = list(title = "Month"),
yaxis = list(title = "Trip Duration in Minutes")
)
Looks like maybe the overall length of trips in 2019 was more consistent. 2020 had a bump in duration of rides, starting in April. Overall, trip length seems more variable in 2020.
manhattan_rides_df %>%
group_by(year) %>%
mutate(
month = month(starttime, label = T)
) %>%
group_by(year, month) %>%
summarise(obs = n()) %>%
plot_ly(
x = ~month,
y = ~obs,
color = ~year,
type = "scatter",
mode = "lines") %>%
layout(
title = "Number of Citibike Rides per Month",
xaxis = list(title = "Month"),
yaxis = list(title = "Rides")
)
Huge drop in monthly trips in April 2020. Lockdown started mid/late March so this coincides with people transitioning to WFH and largely staying inside to minimize contacts. The ride numbers bounce back quite a bit after this but not to 2019 levels.
# the code below saves trip_min, gender, age_group variables of manhattan_rides_df
trip_dur_age_gender_df <-
manhattan_rides_df %>%
select(trip_min, gender, age_group)
# looking at the data we can tell there are massive outliers in trip duration
# so i am going to filter them out using the IQR method
Q1 <- quantile(pull(trip_dur_age_gender_df, trip_min), probs = 0.25)
Q3 <- quantile(pull(trip_dur_age_gender_df, trip_min), probs = 0.75)
inter_quart <- IQR(pull(trip_dur_age_gender_df, trip_min))
trip_dur_age_gender_df <-
trip_dur_age_gender_df %>%
filter(
trip_min >= Q1 - 1.5*inter_quart,
trip_min <= Q3 + 1.5*inter_quart
)
#this generates a plot of boxplots of trip duration (minutes) by gender and age_group
trip_dur_age_gender_df %>%
mutate(
gender = str_to_sentence(gender)
) %>%
plot_ly(x = ~age_group, y = ~trip_min, color = ~gender, type = "box", colors = "viridis") %>%
layout(
boxmode = "group",
xaxis = list(title = "Age Range"),
yaxis = list(title = "Trip Duration (min)"),
legend = list(title = list(text = "<b> Gender </b>"))
)
# this dataframe groups rides by gender and month
# and provides the average age for each gender in that month
# along with the standard deviation, standard error, lower bound, and upper bound
# which are then used to create a plotly graph where we get average age per month for each gender
# with 95% confidence bands around each line
avg_age_per_month_df <-
read_csv("manhattan_rides.csv") %>%
mutate(
date = floor_date(as_date(starttime), "month")
) %>%
select(date, gender, age) %>%
group_by(date, gender) %>%
summarize(
total = n(),
avg_age = mean(age),
sd_age = sd(age)
) %>%
mutate(
sem = sd_age/sqrt(total - 1),
lower_bound = avg_age + qt(0.025, df = total - 1) * sem,
upper_bound = avg_age - qt(0.025, df = total - 1) * sem
) %>%
ungroup()
avg_age_plot <-
avg_age_per_month_df %>%
mutate(
gender = str_to_sentence(gender)
) %>%
ggplot(aes(x = date, y = avg_age, color = gender)) +
geom_line(size = 1, alpha = 0.8) +
geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound), alpha = 0.2)
ggplotly(avg_age_plot) %>%
layout(
xaxis = list(title = "Date"),
yaxis = list(title = "Age"),
legend = list(title = list(text = "<b> Gender </b>"))
)